/*
Read in, deflate, and clean manufacturing data from ASM and CMF
Merge in initial and current energy prices
Merge in IV
*/

clear
local dataRAW "/data/economic/"
local dataSTATA "/projects/data/dataSTATA"
set maxvar 32767 

log using "/projects/logs/cleanmanuf2.log", append


*>----------------- loop over and append all asm and cmf files ----------------<

local i = 0
foreach y of numlist 1976(1)2016 {
	if inlist(`y',1977, 1982, 1987, 1992, 1997, 2002, 2007, 2012) {
		import sas using "/data/economic/cmf/`y'/cmf`y'.sas7bdat", case(lower) clear
		gen ks = tae
		la var ks "Capital Stock (total assets ending, 1000 USD, Current)"
	}
	else {
		import sas using "/data/economic/asm/`y'/asm`y'.sas7bdat", case(lower) clear
		* generate empty capital stock variable, which is missing from asm
		gen ks = .
		la var ks "Capital Stock (total assets ending, 1000 USD, Current)"
	}
	replace year = `y'
	drop if mi(lbdnum)
	destring fipsst, replace

	* electricity prices
	gen elec_price = ee / pe
	la var elec_price "Avg Electricty Purchase Price ($ per kWh, Current USD)"

	* electricity intensity
	gen elec_int = pe / tvs
	la var elec_int "Electricity Intensity (kWh per $ Revenue, Current USD)"

	* labor inputs -- total production worker hours x total payroll / production worker payroll
	* reference to GSW
	gen cl = ph * ( sw / ww )
	la var cl "Total Labor Hours, Adjusted (1000s)"
	
	gen wage = ww / ph
	gen wage_all = sw / cl
	la var wage "Wage ($ per hr)"

	* capital inputs 
	gen ck = tce
	la var ck "Total Capital Expenditures (1000 USD)"
	
	* gen costs of fuels and electricity
	gen cfe = cf + ee
	la var cfe "Total Costs of Fuel and Electricity (1000 USD)"

	* cleaning
	* 1. drop negative or missing values for main variables
	foreach var of varlist elec_price elec_int ck tvs cl cm ee cf {
		drop if `var' ==. | `var' < 0
	}
	
	* 2. drop firms reporting 0 output, labor, materials, electricity inputs, or electricity prices
	foreach var of varlist elec_price tvs cl cm ee {
		drop if `var' == 0
	}
		
	* 3. drop administrative records -- reference to GSW
	drop if ar == 1
	
	* 4. exclude if any input values are greater than 100 x 99th percentile of the distribution
	foreach var of varlist ck tvs cl cm ee cf {
		sum `var', detail
		local p99 = `r(p99)' * 100
		drop if  `var' > `p99'
	}
	
	* capital stock var is in CMF only -- clean separately
	if inlist(`y',1977, 1982, 1987, 1992, 1997, 2002, 2007, 2012, 2017) {
		drop if mi(ks) | ks < 0 
		
		sum ks, detail
		local p99 = `r(p99)' * 100
		drop if  ks > `p99' 
	}
	

	* 5. exclude energy prices greater than ten times or less than one-tenth of the median price
	* in a given industry-year 
	sum elec_price, detail
	local up_bound = `r(p50)' * 10
	local low_bound = `r(p50)'/ 10
	drop if elec_price > `up_bound' | elec_price < `low_bound'
	
	* just keep variables of interest here
	keep elec_price ck tvs ks cl cf cm state postalst wt lbdnum fipsst ge pe ee year firmid cfe wage wage_all

	* merge in lbd
	preserve
		import sas using "/data/economic/lbd/`y'/lbd`y'c_c201600.sas7bdat", case(lower) clear
		drop if mi(lbdnum) | mi(firstyear)
		replace yr = `y'
		tempfile lbd`y'
		sa `lbd`y'', replace
	restore
	
	duplicates drop lbdnum, force

	merge 1:1 lbdnum using `lbd`y'', keepusing(bestsic bestnaics firstyear lastyear state county zip firmid flagb) assert(1 2 3) keep(3) nogen

	* concord all industries to NAICS 1997
	* we keep bestsic as the original sic from lbd
	if `y' < 1987 {
		* concordance is at 4-digit level
		gen bestsic4 = substr(bestsic,1,4)
		destring bestsic4, replace
		preserve
			import delimited "/projects/dstafftransfer/transfer.20200428/conc_sic72_sic87.csv", clear
			duplicates drop sic72, force
			tempfile concord72`y'
			sa `concord72`y'', replace
		restore
		ren bestsic4 sic72
		merge m:1 sic72 using `concord72`y'', assert(1 2 3) keep(1 3) keepusing(sic72 sic87) nogen
		drop sic72
		gen bestsic4 = substr(bestsic,1,4)
		destring bestsic4, replace
		replace bestsic4 = sic87 if !mi(sic87)
	}
	if `y' < 2001 & `y' > 1986 {
		gen bestsic4 = substr(bestsic,1,4)
		destring bestsic4, replace
	}
	
	if `y' < 2001 {
		preserve
			import delimited "/projects/dstafftransfer/transfer.20200428/conc_sic87_naics97.csv", clear
			duplicates drop sic, force
			ren sic bestsic4
			tempfile concord87`y'
			sa `concord87`y'', replace
		restore
		merge m:1 bestsic4 using `concord87`y'', assert(1 2 3) keep(1 3) keepusing(bestsic4 naics97) nogen
		la var bestsic4 "SIC 1987 Rev."
	
		replace bestnaics = "" if mi(bestnaics)
	
		gen last_naics = substr(bestnaics,8,1)
		capture confirm numeric variable last_naics
		gen byte notnumeric = real(last_naics) == .
		replace bestnaics = substr(bestnaics,1,7) + "0" if notnumeric == 1
		drop last_naics notnumeric
	
		destring bestnaics, replace
		replace bestnaics = naics97 if mi(bestnaics) | bestnaics == 0 
	}
	if `y' > 2000 {
		replace bestnaics = "" if mi(bestnaics)
	
		* a few naics codes end in letters -- correct this
		gen last_naics = substr(bestnaics,8,1)
		capture confirm numeric variable last_naics
		gen byte notnumeric = real(last_naics) == .
		replace bestnaics = substr(bestnaics,1,7) + "0" if notnumeric == 1
		drop last_naics notnumeric
	
		destring bestnaics, replace		
	}
		
	la var bestnaics "NAICS Rev. 1997 Ind. Code"
	replace bestsic = "" if mi(bestsic)
	destring bestsic, replace
	
	sum		
	
	if `i' == 1 append using "/projects/data/dataSTATA/economic/asm_cmf_merged.dta"
	local i = 1
	sa "/projects/data/dataSTATA/economic/asm_cmf_merged.dta", replace
}


*>---------------------------- set up deflators -------------------------------<

import delimited "/projects/dstafftransfer/transfer.20200428/naics5811.csv", clear 
* relevant deflators:
* - piship -- value of shipments
foreach var of varlist piship pimat pien piinv {
	gen `var'_n = `var' if year == 2011
	bys naics: egen `var'_2011 = mean(`var'_n)
	gen `var'_dfltr = `var'_2011 / `var'
	drop `var'_n `var'_2011 
	
}
keep naics year *_dfltr
ren naics bestnaics

tempfile deflators
sa `deflators', replace

* for energy prices in electricity sector, we will use average of energy prices deflators in our industries (referenced to GSW)
collapse (mean) pien_dfltr, by(year)
ren pien_dfltr pien_avg
tempfile energy_deflators
sa `energy_deflators', replace

*>-------------------------- set up and deflate IV ----------------------------<


import delimited "/projects/dstafftransfer/transfer.20200824/seds_iv.csv", case(preserve) encoding(ISO-8859-2) clear
* keep only electricity prices and lagged shares
keep ef* L* year state
ren state postalst
merge m:1 year using `energy_deflators', assert(1 2 3) keep(1 3) keepusing(year pien_avg) nogen

* use post 2016 when it becomes available
* deflate prices, relabel vars
foreach var in cl ng pa ww nu{
	replace efPriceNational`var' = efPriceNational`var' * pien_avg
	label var efPriceNational`var' "`var': Real National Price for Elec. Gen. (Leave out, Consumption Weighted Mean)"
}
drop pien_avg
foreach var in 0 2 5{
	foreach var2 in cl ng pa ww nu{
		la var L`var'elecShareB_`var2' "Lag `var' of `var2' Share in Elec. Power Sector (Btu)" 
		la var L`var'elecShareV_`var2' "Lag `var' of `var2' Share in Elec. Power Sector (Expenditure)"
	}
}
foreach var in 0 2 5{
	foreach var2 in ge hy{
		la var L`var'elecShareB_`var2' "Lag `var' of `var2' Share in Elec. Power Sector (Btu)"	
	}
}

tempfile ivvars
sa `ivvars', replace

* now save a version with the instruments for initial energy prices
ren year firstyear
foreach var in cl ng pa ww nu{
	ren efPriceNational`var' efPriceNational`var'_init 
}
foreach var in 0 2 5{
	foreach var2 in cl ng pa ww nu{
		ren L`var'elecShareB_`var2' L`var'elecShareB_`var2'_init
		ren L`var'elecShareV_`var2' L`var'elecShareV_`var2'_init
	}
}
foreach var in 0 2 5{
	foreach var2 in ge hy{
		ren L`var'elecShareB_`var2' L`var'elecShareB_`var2'_init 	
	}
}

tempfile ivvars_init
sa `ivvars_init', replace
sa "/projects/data/dataSTATA/temp/ivvars_init.dta", replace


*>---------------------- deflate manuf data, merge with IV --------------------<

use "/projects/data/dataSTATA/economic/asm_cmf_merged.dta", clear

sum

* some NAICS codes have two extra digits depending on year -- problematic for merge with deflators
tostring bestnaics, gen(naics6)
replace naics6 = substr(naics6,1,6)
destring naics6, replace
replace bestnaics = naics6
drop naics6


* merge with IV vars
merge m:1 postalst year       using `ivvars', assert(1 2 3) keep(3) nogen
merge m:1 postalst firstyear  using `ivvars_init', assert(1 2 3) keep(1 3) nogen

* bring in NBER deflators here
merge m:1 bestnaics year using `deflators', assert(1 2 3) keep(1 3) nogen

drop if bestnaics > 399999

gen naics4 = floor(bestnaics/100)
foreach var in pien_dfltr piinv_dfltr piship_dfltr pimat_dfltr {
	bys naics4 year: egen `var'_avg = mean(`var')
	replace `var' = `var'_avg if mi(`var')
}
drop naics4

* deflate
replace elec_price = elec_price * pien_dfltr
la var elec_price "Avg. Price of Purchased Electricity, Calc. ($ per kWh, 2011 USD)"

replace ck = ck * piinv_dfltr
la var ck "Total Capital Expenditures (1000 2011 USD)"

replace ks = ks * piinv_dfltr
la var ks "Capital Stock (total assets ending, 1000 2011 USD)"

replace tvs = tvs * piship_dfltr
la var tvs "Total Value of Shipments (1000 2011 USD)"

replace cf = cf * pien_dfltr
la var cf "Cost of Fuels, Including Electricity (1000 2011 USD)"

replace cm = cm * pimat_dfltr
la var cm "Cost of Materials (1000 2011 USD)"

replace ee = ee * pien_dfltr
la var ee "Cost of Purchased Electricity (1000 2011 USD)"

replace cfe = cfe * pien_dfltr
la var cfe "Total Cost of Fuels and Electricity (1000 2011 USD)"

gen elec_int = pe / tvs
la var elec_int "Electricity Intensity, Calc. (kWh per $ revenue, 2011 USD)"

replace wage = wage * pimat_dfltr
la var wage "Wage ($ per hr, 2011 USD)"

replace wage_all = wage_all * pimat_dfltr
la var wage_all "Wage All Workers ($ per hr, 2011 USD)"

foreach var of varlist elec_price ck ks tvs cf cm ee elec_int cl wage {
	gen l`var' = log(`var')
	la var l`var' "log of `var'"
}
drop if elec_price == 0


sa "/projects/data/dataSTATA/temp/combined_economic.dta", replace

drop if firstyear == 1975
sa "/projects/data/dataSTATA/temp/combined_working.dta", replace


*>-------------------- calculate energy prices by entry date ------------------<

** initial energy prices if you are in the cmf or asm in the first year you are in the lbd -- then we know exactly what your price is
use "/projects/data/dataSTATA/temp/combined_economic.dta", clear
keep if firstyear == year
ren elec_price elec_price_init
la var elec_price_init "Price of Purchased Electricity in Entry Year ($ per kWh, 2011 USD)"
gen lelec_price_init = log(elec_price_init)
la var lelec_price_init "log of mean price of purchased electricity in entry year, industry, state ($ per kWh, 2011 USD)"

sum elec_price_init, detail
bys state: sum elec_price_init

* lbdnum uniquely identifies plants over time
keep firstyear elec_price_init lelec_price_init lbdnum
sa "/projects/data/dataSTATA/economic/init_prices_i.dta", replace



** initial energy price average by firstyear state bestnaics
use "/projects/data/dataSTATA/temp/combined_economic.dta", clear
keep if year == firstyear
collapse (mean) elec_price, by(year bestnaics state)
ren elec_price elec_price_init_avg
la var elec_price_init_avg "mean price of purchased electricity in entry year, industry, state ($ per kWh, 2016 USD)"
gen lelec_price_init_avg = log(elec_price_init_avg)
la var lelec_price_init_avg "log of mean price of purchased electricity in entry year, industry, state ($ per kWh, 2016 USD)"
ren year firstyear

sum elec_price_init_avg, detail
bys state: sum elec_price_init_avg

sa "/projects/data/dataSTATA/economic/init_prices_avg.dta", replace

** initial energy price average by firstyear state 
use "/projects/data/dataSTATA/temp/combined_economic.dta", clear
keep if year == firstyear
* collapse by year state
collapse (mean) elec_price, by(year state)
ren elec_price elec_price_init_avg2
la var elec_price_init_avg2 "mean price of purchased electricity in entry year, state ($ per kWh, 2016 USD)"
gen lelec_price_init_avg2 = log(elec_price_init_avg)
la var lelec_price_init_avg2 "log of mean price of purchased electricity in entry year, state ($ per kWh, 2016 USD)"
ren year firstyear

sum elec_price_init_avg2, detail
bys state: sum elec_price_init_avg2

sa "/projects/data/dataSTATA/economic/init_prices_avg2.dta", replace

*>------------------------------ build IV -------------------------------------<
*>-------------------- using 1976 shares --------------------------------------<
use "/projects/data/dataSTATA/temp/combined_working.dta", clear
merge m:1 firstyear lbdnum               using "/projects/data/dataSTATA/economic/init_prices_i.dta",   assert(1 2 3) keep(1 3) nogen
merge m:1 firstyear bestnaics state      using "/projects/data/dataSTATA/economic/init_prices_avg.dta", assert(1 2 3) keep(1 3) nogen
merge m:1 firstyear state                using "/projects/data/dataSTATA/economic/init_prices_avg2.dta", assert(1 2 3) keep(1 3) nogen
replace elec_price_init  = elec_price_init_avg    if mi(elec_price_init)
replace elec_price_init  = elec_price_init_avg2   if mi(elec_price_init)
drop lelec_price*
gen lelec_price_init = log(elec_price_init)
gen lelec_price	 = log(elec_price)

gen wt_cmf = wt
replace wt_cmf = 1 if inlist(year, 1977, 1982,1987,1992,1997,2002,2007,2012) 

foreach var in B V {
	foreach var2 in cl ng pa {
		gen L0elecShare`var'_`var2'_1976 = L0elecShare`var'_`var2' if year == 1976
		bys state: egen elecShare`var'_`var2'_1976 = mean(L0elecShare`var'_`var2'_1976)
	}
}

* instrument that combines all fuel shares
foreach var in B V {
	gen instr_`var' = efPriceNationalcl*elecShare`var'_cl_1976 + efPriceNationalng * elecShare`var'_ng_1976  + efPriceNationalpa*elecShare`var'_pa_1976 
  }

  foreach var in B V {
	gen instr_`var'_init = efPriceNationalcl_init*elecShare`var'_cl_1976 + efPriceNationalng_init * elecShare`var'_ng_1976  + efPriceNationalpa_init*elecShare`var'_pa_1976 
  }
  

* instrument that uses each fuel share separately
foreach var in B V {
	foreach var2 in cl ng pa{
		
		gen instr_`var'_`var2' = efPriceNational`var2'*elecShare`var'_`var2'_1976
		gen instr_`var'_`var2'_init = efPriceNational`var2'_init*elecShare`var'_`var2'_1976
	}
} 

* identify industries with few raw fuels
gen elec_perc = ee / (ee + cf)
la var elec_perc "Share of Total Energy Costs from Electricity"
bys bestnaics: egen elec_perc_ind = mean(elec_perc) 
la var elec_perc_ind "Industry Average Share of Total Energy Costs from Electricity, All Years"

* other cleaning
drop ef* L* *dfltr naics97 sic87 elecShare* *_avg *_avg2 wt
la var bestsic4 "SIC 1987 Rev."
la var bestsic "SIC code from raw data (1972 or 1987 Rev.)"
la var lelec_price "log of elec_price"
la var lelec_price_init "log of elec_price_init"
la var wt_cmf "Sample Weight"
destring zip, replace
la var zip "ZIP code, 9-digit"
la var lastyear "exit year"
replace lastyear = . if lastyear == 2016


sa "/projects/data/dataSTATA/combined/combined_iv.dta", replace


log close
